library(SingleCellExperiment)
library(ggplot2)
library(scFeatures)
library(ClassifyR)
library(lisaClust)
library(ggthemes)
library(spicyR)
library(dplyr)
library(limma)
library(plotly)
library(scattermore)
library(tidyr)
library(survival)
library(survminer)
library(spatstat.geom)
library(scater)
library(scran)
library(SPOTlight)
library(reshape)
theme_set(theme_classic())
## these are some codes for plotting 


plot_boxplot <- function( feature ){
  
  data <- t(feature)

  patient <- unlist( lapply( strsplit( colnames(data), "_cond_"), `[`, 1) ) 
  condition  <- unlist( lapply( strsplit( colnames(data), "_cond_"), `[`, 2))
  condition <- data.frame(condition = condition )
      
  design <- model.matrix(~condition, data = condition)
  fit <- lmFit(data, design)
  fit <- eBayes(fit)
  tT <- topTable(fit, n = Inf) 
  
  
  # selecting the top 10 DE features 
  top_gene <- rownames( tT)[1:10 ]
  rownames( condition) <- colnames(data)
   
  data_plot <-  data[top_gene, ]   
  
  data_plot <- melt(data_plot )
  
  colnames(data_plot) <- c("X1", "X2", "value")
   
  data_plot$condition <- unlist( lapply( strsplit(  as.character( data_plot$X2), "_cond_"), `[`, 2))
  
  
  p <- ggplot(data_plot, aes( x = X1, y = value , colour = condition)) + 
    geom_boxplot()  + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
  
   
  return(p)
  
}
  


plot_barplot <- function(data , dodge=F  ){
  

  data$patient <- unlist( lapply( strsplit( rownames(data ), "_cond_"), `[`, 1))
  data$condition <- unlist( lapply( strsplit( rownames(data ), "_cond_"), `[`, 2))
  
  data <- as.data.frame( melt(data, id=c("patient", "condition")) )
 
 
  p <-   ggplot(data , aes( x = patient , y = value , fill = variable) ) +   
    geom_bar(stat="identity"   ) + facet_wrap(~condition, scale="free") + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))  + 
    scale_fill_manual(values = color_codes)
 
  
 
 return (p)
  
 
 
}

1 Overview

As single cell technology advances, the recent development of spatial omics allows us to examine the spatial organisation of cells within tissues in their native environment. This workshop will discuss the challenges and analytical focus associated with using multi-sample spatial datasets for disease prediction. We will also talk about general analytical strategies and the critical thinking questions that arise in the workflow.


Preparation and assumed knowledge

Learning objectives

  • Describe and visualise spatial omics datasets
  • Calculate different measures that describe the spatial distribution of cell types
  • Generate individual feature representations from a cell-level expression matrix
  • Perform multi-view disease outcome prediction with the package ClassifyR
  • Develop understanding on
    • how to assess the performance of classification models
  • Explore various strategies for disease outcome prediction using spatial omics data

Note: This data analysis workshop offers participants the opportunity to engage in hands-on analysis using R. However, if you are not comfortable with coding in R, you can still acquire valuable interpretation skills by reviewing the output we provide in this file.

1.0.1 Time outline

Structure of the 3-hour workshop:

Activity Time
Introduction 15m
Exploring spatial data 35m
Feature engineering with scFeatures 50m
Break 15m
Survival analysis with ClassifyR 45m
Identify cohort heterogeneity 20m
Concluding remarks

2 Initial exploration and visualisation

Data and background

In this demo, we look at a Visium dataset taken from Kuppe, C., Ramirez Flores, R. O., Li, Z., Hayat, S., Levinson, R. T., Liao, X., … & Kramann, R. (2022). Spatial multi-omic map of human myocardial infarction. Nature, 608(7924), 766-777.

Visium captures spatial information, creating images that display the distribution of different cell types and their associated gene expression patterns in the tissue.

In this dataset, the authors quantified the expression of >10000 genes in control and in patients with myocardial infarction. In this demo, we examine patients defined to be myogenic group and ischaemic group. Myogenic group is defined by sample taken from control and unaffected remote zone, ischaemic group is defined by sample taken from ischaemic zone.

2.1 Exploration 1: How complex is my data ?

Examine the data objects:
- The dataset contains 11,681 genes and 19,000 cells (after subsampling).
- The outcome is 11 myogenic samples (4 from control, 7 from remote zone) and 8 ischaemic samples.

data_sce <- readRDS("~/data/small_data.rds")
 
data_sce
## class: SpatialExperiment 
## dim: 11681 19000 
## metadata(3240): X_approximate_distribution X_normalization ...
##   schema_version title
## assays(2): X logcounts
## rownames(11681): LINC01409 LINC01128 ... VAMP7 AC007325.4
## rowData names(1): features
## colnames(19000): ACAGAACTGAGAACAA-1 TTGCCCTGATCACGGG-1 ...
##   CTGACATAGAAATAGA-1.16 ATTAGGCGATGCTTTC-1.24
## colData names(33): n_counts n_genes ... number_cells condition
## reducedDimNames(3): X_pca X_spatial X_umap
## mainExpName: NULL
## altExpNames(0):
## spatialCoords names(0) :
## imgData names(1): sample_id
## Expression matrix is stored in genes by cells matrix
logcounts(data_sce)[1:7, 1:7]
## 7 x 7 sparse Matrix of class "dgCMatrix"
##           ACAGAACTGAGAACAA-1 TTGCCCTGATCACGGG-1 CATATAGGTACAGTCA-1
## LINC01409           .                  .                  .       
## LINC01128           1.431438           .                  1.351835
## NOC2L               .                  .                  .       
## KLHL17              .                  .                  .       
## HES4                .                  .                  .       
## ISG15               .                  1.279462           .       
## AGRN                .                  .                  .       
##           TGCGAGAATATTACCC-1 TATTCAATTCTAATCC-1 GTCCTATTGTTGTGGT-1
## LINC01409           .                  .                  .       
## LINC01128           .                  .                  .       
## NOC2L               .                  .                  .       
## KLHL17              .                  .                  .       
## HES4                1.146986           1.389528           .       
## ISG15               .                  .                  1.454652
## AGRN                .                  .                  .       
##           TAACCTACCGTCCGAG-1
## LINC01409           .       
## LINC01128           .       
## NOC2L               .       
## KLHL17              .       
## HES4                .       
## ISG15               1.475897
## AGRN                .
## The object stores meta data (such as patient outcome information) about each cell
## the metadata is stored in colData() for a spatialexperiment object.
# we need to convert to a data.frame to be input into datatable function
DT::datatable( data.frame(colData(data_sce))[1:5, ], options = list(scrollX = TRUE))

2.2 Exploration 2: How to visualise my data ?

Typically in a spatial data, we perform dimension reduction to reduce and project the high dimensional cell by gene matrix on to 2D space. This allows us to visualise various things of interest, such as distribution of cell types and disease outcomes.

Visium is a spot-based technology, meaning that each spot capture 1-10 cells. Therefore each spot may represent a mixture of cells from different cell types. In this dataset, the author used cell2location which is a spatial deconvolution tool to predict the cell type probability or composition of each of the spot. There are also many other deconvolution tools available, eg, CARD published in Ma, Y., & Zhou, X. (2022). Spatially informed cell-type deconvolution for spatial transcriptomics. Nature biotechnology, 40(9), 1349-1359.

For demonstration purpose, in the plot below we visualise each spot using the cell type with maximum probability.

data_sce <- runUMAP(data_sce, exprs_values = "logcounts", scale=TRUE, min_dist = 0.3)
## To highlight can highlight by [EXPLAIN]

a <- plotUMAP(data_sce, colour_by = "celltype")
b <- plotUMAP(data_sce, colour_by = "condition")
c <- plotUMAP(data_sce, colour_by = "sample")

## this allows us to combine multiple plot in a grid format 
ggarrange(plotlist = list(a,b,c))

Interactive Q&A:

  • Q1: Is there patient batch effect?
  • Q2: Are the myogenic group and ischaemic group easy or difficult to distinguish?
  • Any interesting patterns from the plot?

2.3 Exploration 3: Is there a spatial structure in my data?

The advantage with spatial omics is that we can examine the organisation of the cell types as it occurs on the tissue slide. Here we visualise one of the slides from a selected patient. As an optional exercise, you can:

  • permute the cell type label.
  • permute the spatial coordinate.

to give a sense of what is random ordering.

# SET UP code for visualising 

# this section of the code is to set consistent colour for each cell type throughout this rmarkdown
# we use the colour palette Tableau
# because it only has 10 colours and we have 11 cell types, we colour the last cell type grey. 
celltype <- c("Adipocyte" , "Cardiomyocyte" , 
              "Endothelial" ,  "Fibroblast" , "Lymphoid" ,    
              "Mast" ,   "Myeloid" , 
              "Neuronal" , "Pericyte" , "Cycling.cells", "vSMCs" )

tableau_palette <- scale_colour_tableau() 
color_codes <-  c( tableau_palette$palette(10) , "lightgrey")
names(color_codes) <- celltype

Here we choose a particular patient “IZ_P9_cond_Ischaemic” (a patient in the ischaemic group) and visualise its spatial pattern.

# select one sample to visualise  
one_sample_data <- data_sce[, data_sce$sample  ==  "IZ_P9_cond_Ischaemic"]
one_sample  <- data.frame( colData(one_sample_data) )
a <- ggplot(one_sample, aes(x = spatial_x , y = spatial_y, colour = celltype)) + geom_point(alpha=0.7) +  scale_colour_manual(values = color_codes) + ggtitle("Original slide")
## Optional: Permute the cell type labels

one_sample$celltype_permute <- sample(one_sample$celltype)
b <- ggplot(one_sample, aes(x = spatial_x , y =  spatial_y, colour =celltype_permute)) + geom_point(alpha=0.7) +  scale_colour_manual(values = color_codes)  + ggtitle("Permute the cell type label")
## Optional: Permute the spatial coordinate

one_sample$spatial_x_permute <- sample(one_sample$spatial_x)
one_sample$spatial_y_permute <- sample(one_sample$spatial_y)

c <- ggplot(one_sample, aes(x = spatial_x_permute , y = spatial_y_permute, colour = celltype)) + geom_point(alpha=0.7) +  scale_colour_manual(values = color_codes) + ggtitle("Permute the X, Y coordinate")


ggarrange(plotlist = list(a,b,c), ncol =3 )

Comparing the above plot shows spatial randomness.

2.4 Exploration 4: Visium specific visualisation

Instead of plotting the cell type with maximum probability, we can also visualise the cell type composition of each spot using pie chart.

x <- data.frame( imagecol = one_sample$spatial_y, 
                 imagerow = one_sample$spatial_x)
 
rownames(x) <- paste0("Spot", 1:nrow(x))

y <- data.frame( colData(one_sample_data)[ ,celltype  ] )
 
rownames(y) <- paste0("Spot", 1:nrow(y))


## we use a function to plot the piechart
plotSpatialScatterpie(x = x, y = y , pie_scale = 0.7) + theme_classic() +  scale_fill_manual(values = color_codes) + ylab("spatial_y") + xlab("spatial_y")

Interactive Q&A:

Q3: Is there a structure in the data or is the cell type randomly distribution?

3 Describing tissue microenvrionments and cellular neighbourhoods

3.1 Do cell type co-localise in specfic regions?

We begin by examine how can we identify and visualise regions of tissue where spatial associations between cell-types is similar? There are many packages that perform this task andhere we use the lisaClust function [https://www.bioconductor.org/packages/devel/bioc/html/lisaClust.html] that is based on “local L-function” to spatially clusters cells into different regions with similar cell type composition.

set.seed(51773)
 

BPPARAM <- simpleSeg:::generateBPParam(2)

# Cluster cells into spatial regions with similar composition.
data_sce  <- lisaClust(
  data_sce ,
  k = 5,
  Rs = c(20, 50, 100),
  sigma = 50,
  spatialCoords = c("spatial_x", "spatial_y"),
  cellType = "celltype",
  imageID = "sample" ,
  regionName = "region",
  BPPARAM = BPPARAM
)

3.2 Which regions appear to be different between the ischaemic and myogenic patients?

3.2.1 Visualise regions on individual level

Using the slide we previously shown, visualise the region output.

metadata <- colData(data_sce)
metadata <- metadata[ metadata$sample == "IZ_P9_cond_Ischaemic",  ]
metadata <- data.frame(metadata)
metadata$celltype  <- as.character(metadata$celltype )

plotlist <- list()
plotlist_celltype <- list()
thisregion  <-  unique(metadata$region)[1]


tableau_palette <- scale_colour_tableau() 
color_codes <- tableau_palette$palette( 10 )
color_codes <- c(color_codes, "darkgrey" , "grey90") 

names(color_codes) <- c( celltype ,  "other regions")


# put this in a function 
# show the hatching plot as well 
for ( thisregion in sort(unique(metadata$region))){
        
        selected_region_index <-  metadata$region == thisregion
        
        metadata$selected_region <-  "other regions"
        metadata$selected_region[selected_region_index] <- "selected region"
        
        metadata$selected_celltype <- metadata$celltype
        metadata$selected_celltype[!selected_region_index ] <-   "other regions"
        
        # metadata$celltype <- factor(metadata$celltype, levels = c(unique(metadata$celltype), "other regions"))

       p <- ggplot(metadata, aes(x = spatial_x , y = spatial_y, colour = selected_region  )) + 
         geom_point( alpha = 0.8 ) + ggtitle(thisregion) + scale_colour_manual(values = c("grey" , "red"))
         
       
    
       p2 <-  ggplot(metadata, aes(x = spatial_x , y = spatial_y, colour =  selected_celltype )) +
         geom_point(alpha = 0.8 ) + ggtitle(thisregion) + scale_colour_manual(values =  color_codes)
       
      plotlist [[thisregion ]] <- p
       
      plotlist_celltype [[thisregion ]] <- p2
}

ggarrange(plotlist = plotlist , ncol = 5, nrow = 1 , common.legend = T )

ggarrange(plotlist = plotlist_celltype , ncol = 5, nrow = 1 , common.legend = T )

3.2.2 Visualise regions across patients

We can better interpret the region output by summarising the proportion of each cell type in a region across the patients.

Looking at the composition of cell types in each region, comparing between ischaemic and myogenic.

df <- data.frame(colData( data_sce))
 

# comment every line of the code 

df_plot <- NULL
for ( thispatient in unique(df$sample)){
  this_df <- df[df$sample == thispatient, ]
  temp_df <-   table( this_df$region, this_df$celltype )
  temp_df <-  temp_df / rowSums(temp_df)
   temp_df <- data.frame(  temp_df)
  temp_df$patient <-  thispatient
  temp_df$reponse <- unique( this_df$condition)
  df_plot <- rbind(df_plot, temp_df)
}

df_plot <- df_plot %>% dplyr::group_by( Var1 , Var2, reponse) %>% 
  summarise(mean_proportion = mean(Freq))
  
 
ggplot(df_plot, aes(y = Var2, x = Var1 ,colour =mean_proportion  , size = mean_proportion ))+  geom_point() + 
  facet_grid(~reponse, scales = "free", space = "free" ) + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + 
  xlab("Region" ) + ylab("Celltype") + scale_colour_viridis_c()

Interactive Q&A:

Q4: Which regions would you focus on next ?

df <- data.frame(colData( data_sce))

df <- df %>% dplyr::group_by(sample , condition , region) %>%
  count(celltype) %>%
  mutate(proportion = n / sum(n))


ggplot(df, aes(y = proportion, x = sample , fill = celltype))+ geom_col()+facet_grid(~region+condition, scales = "free", space = "free" ) + scale_fill_manual(values = c(color_codes))  +
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

Interactive Q&A:

Q5: Does your conclusion change after looking at a different plot

3.3 Further exploration by visualise selected regions

Let’s focus on region 2 and visualise the boxplot of cell type distribution.

df <- data.frame(colData( data_sce))



df_plot <- NULL
for ( thispatient in unique(df$sample )){
  this_df <- df[df$sample == thispatient, ]
  temp_df <-   table( this_df$region, this_df$celltype)
  temp_df <-  temp_df / rowSums(temp_df)
  temp_df <- data.frame(  temp_df)
  temp_df$patient <-  thispatient
  temp_df$reponse <- unique( this_df$condition )
  df_plot <- rbind(df_plot, temp_df)
}

df_plot_region_1 <- df_plot[df_plot$Var1 == "region_2", ]
 
a <- ggplot(df_plot_region_1, aes(x =  Var2,  y = Freq, colour = reponse)) +
  geom_boxplot()+ 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + 
  ylab("Proportion") + xlab("Cell type")+ ggtitle("Region 2") + ylim(0,1)

a

Discussion:

Comparing the cell type composition in the region, what can you tell about the regions?

4 How do we generate molecular representation for each individual ?

In this demo, we use scFeatures to generate molecular representation for each patient. The molecular representation is interpretable and hence facilitates downstream analysis of the patient. Overall, scFeatures generates features across six categories representing different molecular views of cellular characteristics. These include: - i) cell type proportions - ii) cell type specific gene expressions - iii) cell type specific pathway expressions - iv) cell type specific cell-cell interaction (CCI) scores - v) overall aggregated gene expressions - vi) spatial metrics The different types of features constructed enable a more comprehensive multi-view understanding of each patient from a matrix of proteins x cells.

print("number of cells in each sample")
## [1] "number of cells in each sample"
table(data_sce$sample)
## 
##  control_P1_cond_Myogenic control_P17_cond_Myogenic  control_P7_cond_Myogenic 
##                      1000                      1000                      1000 
##  control_P8_cond_Myogenic     IZ_P10_cond_Ischaemic     IZ_P13_cond_Ischaemic 
##                      1000                      1000                      1000 
##     IZ_P15_cond_Ischaemic     IZ_P16_cond_Ischaemic      IZ_P2_cond_Ischaemic 
##                      1000                      1000                      1000 
##      IZ_P3_cond_Ischaemic      IZ_P9_cond_Ischaemic IZ_P9_rep2_cond_Ischaemic 
##                      1000                      1000                      1000 
##      RZ_P11_cond_Myogenic      RZ_P12_cond_Myogenic       RZ_P2_cond_Myogenic 
##                      1000                      1000                      1000 
##       RZ_P3_cond_Myogenic       RZ_P5_cond_Myogenic       RZ_P6_cond_Myogenic 
##                      1000                      1000                      1000 
##       RZ_P9_cond_Myogenic 
##                      1000
# metadata <- data.frame( table(data_sce$sample) )
# colnames(metadata)[1] <- "Patient"
# DT::datatable(metadata , options = list(pageLength = 5), width = "400px")

print("number of cells in each celltype")
## [1] "number of cells in each celltype"
table(data_sce$celltype)
## 
## Cardiomyocyte Cycling.cells   Endothelial    Fibroblast          Mast 
##          8782          1755          1878          3714            41 
##       Myeloid      Pericyte         vSMCs      Lymphoid      Neuronal 
##          2421            44           266            40            34 
##     Adipocyte 
##            25
# metadata <- data.frame( table(data_sce$celltype) )
# colnames(metadata)[1] <- "Region specific cell type"
# DT::datatable(metadata , options = list(pageLength = 5), width = "400px")
#   

Discussion:

Is there any sample or cell types you would like to remove from the data?

4.1 How to create one molecular representations for an individual?

There are different ways you can use scFeatures to generate molecular representation for patients.

4.1.1 Selected features of interest

scFeatures can also generate selected feature types. For example, in the below code we run scFeatures to generate cell type proportion.

#scFeatures requires the following columns 
#  data, sample, x_cord and y_cord
data <- logcounts( data_sce )
sample <- data_sce$sample
spatialCoords <- list(data_sce$spatial_x, data_sce$spatial_y)


# for spatial transcriptomics, we also need to specify the cell type prediction of each spot 
prediction.scores <- colData(data_sce)[, celltype]
prediction.scores <- as.matrix( t(prediction.scores)  )


scfeatures_result <- scFeatures(data = data ,  sample = sample, 
                spatialCoords = spatialCoords , spotProbability = prediction.scores, 
                feature_types = "proportion_raw",
                type ="spatial_t" )
feature <- scfeatures_result$proportion_raw
plot_barplot( feature ) + ggtitle("Proportion raw feature")

4.1.2 All cell types and features in one line of code

The code below generates all feature types for all cell types, to save time we won’t run it in today’s workshop, please just load the prepared RDS file in the following chunk.

scfeatures_result <- scFeatures(data = data ,  sample = sample, 
                spatialCoords = spatialCoords , spotProbability = prediction.scores, 
                type ="spatial_t", ncores = 10)
scfeatures_result <- readRDS("~/data/scfeatures_result_matrix.rds")

4.2 How to explore scFeatures output?

From the section above, we have generated a total of 13 feature types and stored them in a list. All generated feature types are stored in a matrix of samples by features. For example, the first list element contains the feature type “proportion_raw”, which contains the cell type proportion features for each patient sample. We could print out the first 5 columns and first 5 rows of the first element to see.

scfeatures_result <- readRDS("~/data/scfeatures_result_matrix.rds")
 
# we have generated a total of 13 feature types
names(scfeatures_result )
##  [1] "proportion_raw"       "proportion_logit"     "proportion_ratio"    
##  [4] "gene_mean_celltype"   "pathway_mean"         "gene_mean_bulk"      
##  [7] "gene_prop_bulk"       "gene_cor_bulk"        "L_stats"             
## [10] "celltype_interaction" "morans_I"             "nn_correlation"
# each row is a sample, each column is a feature 

meta_table <- data.frame(round ( scfeatures_result [[1]][1:5, 1:5] , 2))
DT::datatable(meta_table , options = list(scrollX = TRUE))

4.2.1 Visually exploring features

Once the features are generated, you may wish to visually explore the features.

Here we plot a volcano plot and a dotplot for the cell type specific expression feature.

gene_mean_bulk <- scfeatures_result$gene_mean_bulk
# this transposes the data
# in bioinformatics conversion, features are stored in rows 
# in statistics convention, features are stored in columns
gene_mean_bulk <- t(gene_mean_bulk)
      
 
condition  <- unlist( lapply( strsplit( colnames(gene_mean_bulk), "_cond_"), `[`, 2))
condition <- data.frame(condition = condition )
design <- model.matrix(~condition, data = condition)
fit <- lmFit(gene_mean_bulk, design)
fit <- eBayes(fit)
tT <- topTable(fit, n = Inf)
tT$gene <- rownames(tT)
p <- ggplot( tT , aes(logFC,-log10(P.Value) , text = gene ) )+
      geom_point(aes(colour=-log10(P.Value)), alpha=1/3, size=1) +
      scale_colour_gradient(low="blue",high="red")+
      xlab("log2 fold change") + ylab("-log10 p-value")  
 
p

tT <- tT[ order(tT$logFC, decreasing = T), ]
tT <- tT[1:20, ]
ggplot( tT , aes( y = reorder(gene, logFC) , x = logFC  ) )+
      geom_point(aes(colour=-log10(P.Value)), alpha=1/3, size=4) +
      scale_colour_gradient(low="blue",high="red")+
      xlab("logFC") + ylab("region specific cel type specfic features" ) 

Interactive Q&A:

Q6: Which figure do you prefer? The volcano plot or the dotplot?

4.2.2 Visualise distribution of features across patients

We can also visualise the distribution of the features across the patients. This helps to examine whether there exists any batch effect due to patient effect.

4.2.2.1 Proportion raw

feature <- scfeatures_result$proportion_raw
 
plot_barplot(feature ) + ggtitle("Proportion raw feature")

4.2.2.2 Gene mean celltype

feature <- scfeatures_result$gene_mean_celltype
feature   <- feature[  , grep("Cardio", colnames(feature)) ]
plot_boxplot(feature) + ggtitle("Gene mean celltype feature")

4.2.2.3 Pathway mean

feature <- scfeatures_result$pathway_mean
plot_boxplot(feature) + ggtitle("Pathway mean feature")

4.2.2.4 Gene mean bulk

feature <- scfeatures_result$gene_mean_bulk
plot_boxplot(feature) + ggtitle("Gene mean bulk feature")

4.2.2.5 Nearest neighbour correlation

feature <- scfeatures_result$nn_correlation
plot_boxplot(feature) + ggtitle("Nearest neighbour correlation feature")

4.2.3 Automatic feature visualisation

To accommodate for easier interpretation of the features, scFeatures contains a function run_association_study_report that enables the user to readily visualise and explore all generated features with one line of code.

# specify a folder to store the html report. Here we store it in the current working directory. 
output_folder <-  getwd()
run_association_study_report(scfeatures_result, output_folder )

4.3 Are the generated features sensible ?

Interactive Q&A:

Using the HTML, we can look at some of the critical thinking questions that a researcher would ask about the generated features. These questions are exploratory and there is no right or wrong answer.

Q7: Do the generated features look reasonable?
Which cell type(s) would you like to focus on at your next stage of analysis?
Which feature type(s) would you like to focus on at your next stage of analysis?
Q8: Are the conditions in your data relatively easy or difficult to distinguish?

5 Can we classify or discrimiante between the ischaemic and myogenic samples ?

In this section we build a classification model to predict whether the samples belong to the ischaemic or myogenic group.

5.1 Building classification model

Recall in the previous section that we have stored the 13 feature types matrix in a list. Instead of manually retrieving each matrix from the list to build separate models, classifyR can directly take a list of matrices as an input and run repeated cross-validation model on each matrix individually.

Below, we run 5 repeats of 3 folds cross-validation.

outcome = scfeatures_result[[1]] %>% rownames %>% strsplit("_cond_") %>% sapply(function(x) x[2])
table(outcome)

### generate classfyr result 
classifyr_result <- crossValidate(scfeatures_result,
                                 outcome, 
                                 classifier = "kNN",
                                 nFolds = 3, 
                                 nRepeats = 5, 
                                 nCores = 5  )

5.2 Visualising the classification performance

To examine the classification model performance, we first need to specify a metric to calculate. Here, we calculate the balanced accuracy.

classifyr_result <-  readRDS("~/data/classifyr_result.rds")

classifyr_result <- lapply(classifyr_result, 
                           function(x) calcCVperformance(x, performanceType = "Balanced Accuracy"))

Format the output and visualise the accuracy using boxplots.

level_order <- names(scfeatures_result)

p  <- performancePlot(classifyr_result) + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + 
  scale_x_discrete(limits = level_order)  

p

6 Appendix

6.1 Explanation of spatial features

  • L function:

The L function is a spatial statistic used to assess the spatial distribution of cell types. It assess the significance of cell-cell interactions, by calculating the density of a cell type with other cell types within a certain radius. High values indicate spatial association, low values indicate spatial avoidance.

tableau_palette <- scale_colour_tableau() 
color_codes <- tableau_palette$palette( 10 )
color_codes <- c(color_codes, "darkgrey" , "grey90") 

names(color_codes) <- c( celltype ,  "other regions")
 


one_sample_data  <- data_sce[ , data_sce$sample == "IZ_P10_cond_Ischaemic"  ]

one_sample <- data.frame( colData(one_sample_data) )
index <-  one_sample$celltype  %in% c("Cardiomyocyte", "Myeloid")
one_sample$celltype <- as.character(one_sample$celltype)
one_sample$celltype[!index] <- "others"
a <- ggplot( one_sample, aes(x = spatial_x , y = spatial_y, colour = celltype )) + geom_point()  + scale_colour_manual(values = c("steelblue", "coral", "grey"))  + ggtitle( "Patient P10 - high L value with \n cardiomyocyte interacting myeloid")
 
one_sample <- data.frame( colData(one_sample_data) )
index <-  one_sample$celltype  %in% c("Cardiomyocyte", "Cycling.cells")
one_sample$celltype <- as.character(one_sample$celltype)
one_sample$celltype[!index] <- "others"
b <- ggplot( one_sample, aes(x = spatial_x , y = spatial_y, colour = celltype )) + geom_point() + scale_colour_manual(values = c("steelblue", "coral", "grey"))  + ggtitle( "Patient 10 - low L value with  \n cardiomyocyte interacting cycling cells")
 
ggarrange(plotlist = list( a , b))

  • Cell type interaction composition:

We calculate the nearest neighbours of each cell and then calculate the pairs of cell type based on the nearest neighbour. This allow us to summarised it into a cell type interaction composition.

tableau_palette <- scale_colour_tableau() 
color_codes <- tableau_palette$palette( 10 )
color_codes <- c(color_codes, "darkgrey" , "grey90") 

names(color_codes) <- c( unique(data_sce$celltype)  )
 

tableau_palette <- scale_colour_tableau() 
color_codes <-  c( tableau_palette$palette(10) , "lightgrey")
names(color_codes) <- celltype



one_sample  <- data_sce[ , data_sce$sample == "IZ_P10_cond_Ischaemic"  ]
one_sample <- data.frame( colData(one_sample) )
 
a <- ggplot( one_sample, aes(x = spatial_x , y = spatial_y, colour = celltype )) + geom_point( alpha = 0.8)  + scale_colour_manual(values = color_codes)  + ggtitle( "Patient P10")


feature  <- scfeatures_result$celltype_interaction
to_plot <- data.frame( t( feature["IZ_P10_cond_Ischaemic" , ])  )
to_plot$feature <- rownames(to_plot) 
colnames(to_plot)[2] <- "celltype interaction composition"
 to_plot <- to_plot[ order(to_plot$IZ_P10_cond_Ischaemic , decreasing = T), ]
 to_plot <-  to_plot[1:5 , ]
 # to_plot <- to_plot[ to_plot$IZ_P10_cond_Ischaemic  > 0.03 , ] 
b <- ggplot(to_plot, aes( x =  reorder(`celltype interaction composition`, IZ_P10_cond_Ischaemic) ,  y = IZ_P10_cond_Ischaemic  ,fill = `celltype interaction composition`)) + geom_bar(stat="identity", fill = "steelblue" ) + ylab("Cell type interactions value")  + xlab("Major cell type interactions") + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) 

ggarrange(plotlist = list( a , b))

  • Moran’s I:

Moran’s I is a spatial autocorrelation statistic based on both location and values. It quantifies whether similar values tend to occur near each other or dispersed.

high  <- data_sce["PDGFD", data_sce$sample == "IZ_P16_cond_Ischaemic"  ]
high_meta <- data.frame( colData(high) ) 
high_meta$expression <- as.vector(logcounts( high)) 

low  <- data_sce["KIF22"  , data_sce$sample == "IZ_P16_cond_Ischaemic" ]
low_meta <- data.frame( colData(low) )
low_meta$expression <- as.vector(logcounts(low))


a <- ggplot(low_meta, aes(x = spatial_x , y = spatial_y, colour =expression)) + geom_point(alpha=0.5) + scale_colour_viridis_c() + ggtitle("Patient P16 - low Moran's I in KIF22")

b <- ggplot(high_meta, aes(x = spatial_x , y = spatial_y, colour =expression)) + geom_point(alpha=0.5) + scale_colour_viridis_c() + ggtitle("Patient P16 - high Moran's I in PDGFD")

ggarrange(plotlist = list( a , b))

  • Nearest Neighbor Correlation:

This metric measures the correlation of proteins/genes between a cell and its nearest neighbor cell.

Here we pick the gene “COL1A1” as an example to illustrate the concept.

plot_nncorrelation <- function(thissample , thisprotein){
   
       sample_name <- thissample
       thissample <- data_sce[, data_sce$sample ==     sample_name]
    
      
      exprsMat <- logcounts(thissample)
     
    
    cell_points_cts <- spatstat.geom::ppp(
            x = as.numeric(thissample$spatial_x), y = as.numeric(thissample$spatial_y),
            check = FALSE,
            xrange = c(
                min(as.numeric(thissample$spatial_x)),
                max(as.numeric(thissample$spatial_x))
            ),
            yrange = c(
                min(as.numeric(thissample$spatial_y)),
                max(as.numeric(thissample$spatial_y))
            ),
            marks = t(as.matrix(exprsMat))
        )
    
     value <-  spatstat.explore::nncorr(cell_points_cts)["correlation", ]
      value <-  value[  thisprotein]
     
    # Find the indices of the two nearest neighbors for each cell
    nn_indices <- nnwhich(cell_points_cts, k = 1)
    
    protein <-  thisprotein
    df <- data.frame(thiscell_exprs  = exprsMat[protein, ] , exprs =  exprsMat[protein,nn_indices ])
    
   p <-  ggplot(df, aes( x =thiscell_exprs ,  y = exprs , colour =  exprs  )) +
      geom_point(alpha = 0.3) + ggtitle(paste0( "Patient ", sample_name ,  " nn_corr = " ,  round(value, 2)  )) + scale_colour_viridis_c() + xlab("This cell expression") + ylab("Neighbouring cell expression")
   
   return (p ) 

}

    
p1 <- plot_nncorrelation( "IZ_P9_cond_Ischaemic" ,  "COL1A1" )
p2 <- plot_nncorrelation( "control_P1_cond_Myogenic"  ,  "COL1A1" )
ggarrange(plotlist = list( p1 , p2))

6.2 Session info

sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 12 (bookworm)
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.21.so;  LAPACK version 3.11.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: Australia/Sydney
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] reshape_0.8.9               SPOTlight_1.5.2            
##  [3] scran_1.28.1                scater_1.28.0              
##  [5] scuttle_1.10.1              spatstat.geom_3.2-1        
##  [7] spatstat.data_3.0-1         survminer_0.4.9            
##  [9] ggpubr_0.6.0                tidyr_1.3.0                
## [11] scattermore_0.8             plotly_4.10.2              
## [13] limma_3.56.2                dplyr_1.1.2                
## [15] spicyR_1.12.0               ggthemes_4.2.4             
## [17] lisaClust_1.9.2             ClassifyR_3.5.21           
## [19] survival_3.5-5              BiocParallel_1.34.2        
## [21] MultiAssayExperiment_1.26.0 generics_0.1.3             
## [23] scFeatures_0.99.27          ggplot2_3.4.2              
## [25] SingleCellExperiment_1.22.0 SummarizedExperiment_1.30.2
## [27] Biobase_2.60.0              GenomicRanges_1.52.0       
## [29] GenomeInfoDb_1.36.1         IRanges_2.34.1             
## [31] S4Vectors_0.38.1            BiocGenerics_0.46.0        
## [33] MatrixGenerics_1.12.2       matrixStats_1.0.0          
## 
## loaded via a namespace (and not attached):
##   [1] SpatialExperiment_1.11.2   R.methodsS3_1.8.2         
##   [3] GSEABase_1.62.0            progress_1.2.2            
##   [5] tiff_0.1-11                EnsDb.Mmusculus.v79_2.99.0
##   [7] goftest_1.2-3              DT_0.28                   
##   [9] Biostrings_2.68.1          HDF5Array_1.28.1          
##  [11] vctrs_0.6.1                spatstat.random_3.1-5     
##  [13] digest_0.6.33              png_0.1-8                 
##  [15] shape_1.4.6                EnsDb.Hsapiens.v79_2.99.0 
##  [17] registry_0.5-1             ggrepel_0.9.3             
##  [19] deldir_1.0-9               magick_2.7.4              
##  [21] MASS_7.3-59                reshape2_1.4.4            
##  [23] httpuv_1.6.11              foreach_1.5.2             
##  [25] withr_2.5.0                ggfun_0.1.1               
##  [27] xfun_0.39                  ellipsis_0.3.2            
##  [29] cytomapper_1.12.0          memoise_2.0.1             
##  [31] proxyC_0.3.3               ggbeeswarm_0.7.2          
##  [33] systemfonts_1.0.4          zoo_1.8-12                
##  [35] GlobalOptions_0.1.2        gtools_3.9.4              
##  [37] SingleCellSignalR_1.12.0   R.oo_1.25.0               
##  [39] prettyunits_1.1.1          KEGGREST_1.40.0           
##  [41] promises_1.2.0.1           httr_1.4.6                
##  [43] rstatix_0.7.2              restfulr_0.0.15           
##  [45] rhdf5filters_1.12.1        rhdf5_2.44.0              
##  [47] rstudioapi_0.14            concaveman_1.1.0          
##  [49] babelgene_22.9             curl_5.0.2                
##  [51] zlibbioc_1.46.0            ScaledMatrix_1.8.1        
##  [53] polyclip_1.10-4            GenomeInfoDbData_1.2.10   
##  [55] fftwtools_0.9-11           xtable_1.8-4              
##  [57] stringr_1.5.0              doParallel_1.0.17         
##  [59] evaluate_0.21              S4Arrays_1.0.6            
##  [61] BiocFileCache_2.8.0        hms_1.1.3                 
##  [63] irlba_2.3.5.1              colorspace_2.1-0          
##  [65] filelock_1.0.2             magrittr_2.0.3            
##  [67] later_1.3.1                viridis_0.6.3             
##  [69] lattice_0.21-8             NMF_0.26                  
##  [71] XML_3.99-0.14              cowplot_1.1.1             
##  [73] ggupset_0.3.0              RcppAnnoy_0.0.21          
##  [75] svgPanZoom_0.3.4           class_7.3-22              
##  [77] pillar_1.9.0               simpleSeg_1.2.0           
##  [79] nlme_3.1-162               EBImage_4.42.0            
##  [81] iterators_1.0.14           gridBase_0.4-7            
##  [83] caTools_1.18.2             compiler_4.3.1            
##  [85] beachmat_2.16.0            stringi_1.7.12            
##  [87] tensor_1.5                 minqa_1.2.5               
##  [89] GenomicAlignments_1.36.0   plyr_1.8.8                
##  [91] msigdbr_7.5.1              crayon_1.5.2              
##  [93] abind_1.4-5                BiocIO_1.10.0             
##  [95] sp_2.0-0                   locfit_1.5-9.8            
##  [97] terra_1.7-39               bit_4.0.5                 
##  [99] codetools_0.2-19           BiocSingular_1.16.0       
## [101] crosstalk_1.2.0            bslib_0.5.0               
## [103] multtest_2.56.0            mime_0.12                 
## [105] splines_4.3.1              circlize_0.4.15           
## [107] Rcpp_1.0.10                dbplyr_2.3.2              
## [109] sparseMatrixStats_1.12.2   knitr_1.43                
## [111] blob_1.2.4                 utf8_1.2.3                
## [113] AnnotationFilter_1.24.0    lme4_1.1-34               
## [115] nnls_1.4                   DelayedMatrixStats_1.22.1 
## [117] GSVA_1.48.2                ggsignif_0.6.4            
## [119] tibble_3.2.1               Matrix_1.5-4.1            
## [121] scam_1.2-14                statmod_1.5.0             
## [123] svglite_2.1.1              tweenr_2.0.2              
## [125] pkgconfig_2.0.3            pheatmap_1.0.12           
## [127] tools_4.3.1                cachem_1.0.8              
## [129] RSQLite_2.3.1              viridisLite_0.4.2         
## [131] DBI_1.1.3                  numDeriv_2016.8-1.1       
## [133] fastmap_1.1.1              rmarkdown_2.23            
## [135] scales_1.2.1               grid_4.3.1                
## [137] shinydashboard_0.7.2       Rsamtools_2.16.0          
## [139] broom_1.0.5                sass_0.4.6                
## [141] BiocManager_1.30.22        graph_1.78.0              
## [143] carData_3.0-5              farver_2.1.1              
## [145] scatterpie_0.2.1           mgcv_1.8-42               
## [147] yaml_2.3.7                 rtracklayer_1.60.0        
## [149] cli_3.6.1                  purrr_1.0.1               
## [151] lifecycle_1.0.3            uwot_0.1.16               
## [153] bluster_1.10.0             backports_1.4.1           
## [155] annotate_1.78.0            gtable_0.3.4              
## [157] rjson_0.2.21               parallel_4.3.1            
## [159] ape_5.7-1                  jsonlite_1.8.7            
## [161] edgeR_3.42.4               bitops_1.0-7              
## [163] bit64_4.0.5                Rtsne_0.16                
## [165] spatstat.utils_3.0-3       BiocNeighbors_1.18.0      
## [167] RcppParallel_5.1.7         jquerylib_0.1.4           
## [169] highr_0.10                 metapod_1.8.0             
## [171] dqrng_0.3.0                survMisc_0.5.6            
## [173] R.utils_2.12.2             lazyeval_0.2.2            
## [175] shiny_1.7.4                htmltools_0.5.5           
## [177] KMsurv_0.1-5               rappdirs_0.3.3            
## [179] ensembldb_2.24.0           glue_1.6.2                
## [181] XVector_0.40.0             RCurl_1.98-1.12           
## [183] jpeg_0.1-10                gridExtra_2.3             
## [185] AUCell_1.22.0              boot_1.3-28.1             
## [187] igraph_1.5.0               R6_2.5.1                  
## [189] gplots_3.1.3               km.ci_0.5-6               
## [191] labeling_0.4.2             GenomicFeatures_1.52.1    
## [193] cluster_2.1.4              rngtools_1.5.2            
## [195] Rhdf5lib_1.22.0            nloptr_2.0.3              
## [197] DelayedArray_0.26.6        tidyselect_1.2.0          
## [199] vipor_0.4.5                ProtGenerics_1.32.0       
## [201] raster_3.6-23              ggforce_0.4.1             
## [203] xml2_1.3.4                 car_3.1-2                 
## [205] AnnotationDbi_1.62.2       rsvd_1.0.5                
## [207] munsell_0.5.0              KernSmooth_2.23-21        
## [209] data.table_1.14.8          htmlwidgets_1.6.2         
## [211] RColorBrewer_1.1-3         biomaRt_2.56.1            
## [213] rlang_1.1.1                spatstat.sparse_3.0-2     
## [215] spatstat.explore_3.2-1     lmerTest_3.1-3            
## [217] fansi_1.0.4                beeswarm_0.4.0

6.3 Acknowledgment

The authors thank all their colleagues, particularly at The University of Sydney, Sydney Precision Data Science and Charles Perkins Centre for their support and intellectual engagement. Special thanks to Ellis Patrick, Shila Ghazanfar, Andy Tran for guiding and supporting the building of this workshop.